{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "def clean_author_metadata(file_path):\n",
    "    \"\"\"\n",
    "    清洗 author_metadata.json 中的 photo 字段，为不以 'https' 开头的值补充前缀 'https://huggingface.co/'。\n",
    "    Args:\n",
    "        file_path (str): 文件路径\n",
    "    \"\"\"\n",
    "    with open(file_path, 'r', encoding='utf-8') as f:\n",
    "        author_data = json.load(f)\n",
    "\n",
    "    # 遍历所有作者信息并清洗 photo 字段\n",
    "    for author, details in author_data.items():\n",
    "        photo_url = details.get('photo', '')\n",
    "        if not photo_url.startswith('https'):\n",
    "            # 补充前缀\n",
    "            author_data[author]['photo'] = f\"https://huggingface.co{photo_url}\"\n",
    "\n",
    "    # 写入清洗后的数据回到文件中\n",
    "    with open(file_path, 'w', encoding='utf-8') as f:\n",
    "        json.dump(author_data, f, ensure_ascii=False, indent=4)\n",
    "        print(f\"Data cleaning complete. Updated data saved to {file_path}\")\n",
    "\n",
    "# 示例：指定 author_metadata.json 的路径\n",
    "file_path = \"author_metadata.json\"\n",
    "clean_author_metadata(file_path)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "\n",
    "def clean_model_metadata(metadata_file, tree_file, output_file):\n",
    "    \"\"\"\n",
    "    清洗 model_metadata.json，通过填补衍生模型缺失的 'pipeline_tag' 属性。\n",
    "\n",
    "    Args:\n",
    "        metadata_file (str): model_metadata.json 的路径。\n",
    "        tree_file (str): model_tree_raw.json 的路径。\n",
    "        output_file (str): 清洗后数据的保存路径。\n",
    "    \"\"\"\n",
    "\n",
    "    # 检查文件是否存在\n",
    "    if not os.path.exists(metadata_file):\n",
    "        print(f\"错误：文件 {metadata_file} 不存在。\")\n",
    "        return\n",
    "    if not os.path.exists(tree_file):\n",
    "        print(f\"错误：文件 {tree_file} 不存在。\")\n",
    "        return\n",
    "\n",
    "    # 加载 model_metadata.json\n",
    "    with open(metadata_file, 'r', encoding='utf-8') as f:\n",
    "        try:\n",
    "            model_metadata = json.load(f)\n",
    "            print(f\"成功加载 {metadata_file}。\")\n",
    "        except json.JSONDecodeError as e:\n",
    "            print(f\"错误：无法解析 {metadata_file}。错误信息：{e}\")\n",
    "            return\n",
    "\n",
    "    # 加载 model_tree_raw.json\n",
    "    with open(tree_file, 'r', encoding='utf-8') as f:\n",
    "        try:\n",
    "            model_tree_raw = json.load(f)\n",
    "            print(f\"成功加载 {tree_file}。\")\n",
    "        except json.JSONDecodeError as e:\n",
    "            print(f\"错误：无法解析 {tree_file}。错误信息：{e}\")\n",
    "            return\n",
    "\n",
    "    # 记录缺失 'pipeline_tag' 的衍生模型\n",
    "    missing_pipeline_tags = []\n",
    "\n",
    "    # 遍历每个基模型及其衍生模型\n",
    "    for base_model, dependencies in model_tree_raw.items():\n",
    "        # 获取基模型的 'pipeline_tag'，默认为 None\n",
    "        base_pipeline_tag = model_metadata.get(base_model, {}).get('pipeline_tag')\n",
    "\n",
    "        for dep_type, derived_models in dependencies.items():\n",
    "            for derived_model in derived_models:\n",
    "                # 检查衍生模型是否存在于 model_metadata.json 中\n",
    "                if derived_model in model_metadata:\n",
    "                    derived_pipeline_tag = model_metadata[derived_model].get('pipeline_tag')\n",
    "\n",
    "                    if derived_pipeline_tag is None:\n",
    "                        if base_pipeline_tag is not None:\n",
    "                            # 填补衍生模型的 'pipeline_tag' 为基模型的值\n",
    "                            model_metadata[derived_model]['pipeline_tag'] = base_pipeline_tag\n",
    "                            print(f\"更新模型 '{derived_model}' 的 'pipeline_tag' 为基模型 '{base_model}' 的值：'{base_pipeline_tag}'。\")\n",
    "                        else:\n",
    "                            # 如果基模型的 'pipeline_tag' 也是 None，设置为 'nan'\n",
    "                            model_metadata[derived_model]['pipeline_tag'] = 'nan'\n",
    "                            missing_pipeline_tags.append(derived_model)\n",
    "                            print(f\"更新模型 '{derived_model}' 的 'pipeline_tag' 为 'nan'。\")\n",
    "\n",
    "    # 如果存在缺失 'pipeline_tag' 的模型，记录日志\n",
    "    if missing_pipeline_tags:\n",
    "        print(\"\\n警告：以下衍生模型的 'pipeline_tag' 被设置为 'nan'，因为它们的基模型的 'pipeline_tag' 也是缺失的：\")\n",
    "        for model in missing_pipeline_tags:\n",
    "            print(f\"- {model}\")\n",
    "\n",
    "    # 保存清洗后的数据到新的 JSON 文件\n",
    "    with open(output_file, 'w', encoding='utf-8') as f:\n",
    "        json.dump(model_metadata, f, indent=4, ensure_ascii=False)\n",
    "        print(f\"\\n清洗后的数据已保存到 {output_file}。\")\n",
    "\n",
    "    print(\"\\n数据清洗完成。\")\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    # 定义文件路径\n",
    "    metadata_file = 'model_metadata.json'\n",
    "    tree_file = 'model_tree_raw.json'\n",
    "    output_file = 'cleaned_model_metadata.json'\n",
    "\n",
    "    # 调用清洗函数\n",
    "    clean_model_metadata(metadata_file, tree_file, output_file)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "viz",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.21"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
